#!/bin/bash

# This script parses a UCSC Assembly table into a Go source code file.
#
# This script will only work on the Assembly table.
#
# The prefix, e.g. chr, will be used to label the chromosomes (e.g. chr1, chr2 ... )
# By default, "chr" is used. The package will be used to name the generated package.
#
# To download data tables, see http://genome.ucsc.edu/cgi-bin/hgTables
#
# USE OF THIS SCRIPT WITHOUT A FILTER OR WITH NOFRAGS UNSET
# SHOULD IN MOST CASES BE SEEN AS COMPILER ABUSE.

# Positional arguments:
#   $1 file    - gzip-compressed UCSC assembly table to read (required)
#   $2 prefix  - chromosome name prefix used in the generated Chr strings;
#                defaults to "chr"
#   $3 species - species name inserted into the generated package comment
#   $4 package - name of the generated Go package
#   $5 filter  - grep regular expression; table lines matching it are dropped.
#                Defaults to '^$', which only drops empty lines.
#   $6 nofrags - when non-empty, no fragment declarations are generated
file=$1
prefix=${2:-chr}
species=$3
package=$4
filter=${5:-'^$'}
nofrags=$6

if [[ -z "$file" ]]; then
	# Report on stderr and fail, so the message is not mistaken for generated
	# source when stdout is redirected into a .go file.
	echo "Please specify the UCSC assembly table file" >&2
	exit 1
fi

# label is the prefix with its first character upper-cased (e.g. chr -> Chr);
# it is used to build the names of the generated Go variables.
label="$(tr '[:lower:]' '[:upper:]' <<< "${prefix:0:1}")${prefix:1}"

# assembly_records decompresses the table named by $file and writes one
# space-separated record per row to stdout. Lines starting with '#' and lines
# matching the $filter regular expression are dropped.
#
# Each record is the table row (tabs turned into single spaces, first "chr"
# removed) prefixed with a copy of its second column, which serves as a sort
# key. The consumers below read the fields as:
#   $1 sort key    $3 chromosome name   $4 start   $5 end
#   $7 type        $8 fragment name     $9 fragment start
#   $10 fragment end                    $11 strand sign
# The key is rewritten so that a general-numeric sort places named
# chromosomes after the numbered ones: X/Z -> 1.1e10, Y/W -> 1.2e10,
# M -> 1.3e10, Un* -> 2e10; a trailing arm letter L/R (any case) on a numbered
# chromosome becomes n / n.5.
assembly_records() {
	zcat < "$file" \
	| grep -v '^#' \
	| grep -v -e "$filter" \
	| sed -e 's/\t/ /g' -e 's/chr//' \
	| tr -s ' ' \
	| awk '{print $2,$0}' \
	| sed -e 's/^[XZ]/1.1e10/' -e 's/^[YW]/1.2e10/' -e 's/^M/1.3e10/' -e 's/^Un[^ ]\+/2e10/' \
	| sed -e 's/^\([1-9][0-9]*\)L/\1/i' -e 's/^\([1-9][0-9]*\)R/\1.5/i'
}

# chromosome_records reduces the assembly records to one record per
# chromosome. The first sort puts the largest end coordinate first within each
# key; the second keeps a single record per key/chromosome pair.
chromosome_records() {
	assembly_records \
	| sort -k1,1g -k5rn,5 \
	| sort -k1,1g -k3,3 -u
}

# fragment_records writes every assembly record ordered by sort key and then
# by chromosome name.
fragment_records() {
	assembly_records \
	| sort -k1,1g -k3,3
}

# emit_chromosome_vars prints one genome.Chromosome declaration per chromosome.
emit_chromosome_vars() {
	chromosome_records \
	| awk -v prefix="$prefix" -v label="$label" '{print "\t"label$3" = genome.Chromosome{Chr: \""prefix$3"\", Desc: \"Chromosome\", Length:",$5"}"}'
}

# emit_chromosome_list prints a pointer to each declared chromosome.
emit_chromosome_list() {
	chromosome_records \
	| awk -v label="$label" '{print "\t&"label$3","}'
}

# emit_fragment_vars prints one genome.Fragment declaration per record. The
# variable name is <label><chromosome>_<fragment>_<start>; the trailing sed
# turns the first '.' of that name into '_' to make it a valid Go identifier.
emit_fragment_vars() {
	fragment_records \
	| awk -v label="$label" '{print "\t"label$3"_"$8"_"$4" = genome.Fragment{Frag: \""$8"\", Desc: \"Fragment\", Chr: &"label$3", ChrStart:",$4", ChrEnd: "$5", FragStart:",$9", FragEnd: "$10", Type: \x27"$7"\x27, Strand:",$11"1}"}' \
	| sed 's/\.\(.*=\)/_\1/'
}

# emit_fragment_list prints a pointer to each declared fragment. The name must
# be built from the same fields ($3, $8, $4) as in emit_fragment_vars so that
# every entry refers to a declared variable.
emit_fragment_list() {
	fragment_records \
	| awk -v label="$label" '{print "\t&"label$3"_"$8"_"$4","}' \
	| sed 's/\./_/'
}

# emit_init prints the Go init function that attaches each fragment to its
# chromosome. The text is emitted verbatim; the number in the //line directive
# is a literal and is not kept in sync with this script's line numbers.
emit_init() {
	cat << 'END'
//line parse.assembly:99
func init() {
	for _, b := range Fragments {
		b.Chr.(*genome.Chromosome).Features = append(b.Chr.(*genome.Chromosome).Features, b)
	}
	for _, c := range Chromosomes {
		fc := make([]feat.Feature, len(c.Features))
		copy(fc, c.Features)
		c.Features = fc
	}
}
END
}

# Assemble the Go source file and let gofmt normalise its layout.
(
	printf '%s\n\n' "// DO NOT EDIT. This file was autogenerated by parse.assembly"
	printf '%s\n' "// Package $package defines chromosome and assembly fragment intervals for the $package genome assembly for $species."
	printf 'package %s\n\n' "$package"

	# The feat package is only referenced by the fragment init code.
	printf 'import (\n'
	if [[ -z "$nofrags" ]]; then
		printf '\t"code.google.com/p/biogo/feat"\n'
	fi
	printf '\t"code.google.com/p/biogo/feat/genome"\n)\n\n'

	# chromosomes
	printf 'var (\n'
	emit_chromosome_vars
	printf ')\n\n'
	printf 'var Chromosomes = []*genome.Chromosome{\n'
	emit_chromosome_list
	printf '}\n\n'

	# fragments
	if [[ -z "$nofrags" ]]; then
		printf 'var (\n'
		emit_fragment_vars
		printf ')\n\n'
		printf 'var Fragments = []*genome.Fragment{\n'
		emit_fragment_list
		printf '}\n\n'

		# init
		emit_init
	fi
) | gofmt
